<!DOCTYPE html>
<html lang="zh-cn">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <title>NLP学习笔记 - fangd123</title>
  <meta name="renderer" content="webkit" />
<!-- Viewport: no maximum-scale cap, so visitors can still pinch-zoom the page (capping zoom is an accessibility failure). -->
<meta name="viewport" content="width=device-width, initial-scale=1"/>

<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />

<meta name="theme-color" content="#f8f5ec" />
<meta name="msapplication-navbutton-color" content="#f8f5ec">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="#f8f5ec">


<meta name="author" content="fangd123" /><meta name="description" content="文本相似度 构词法相似 词根提取 波特词干（Porter Streamming）提取算法 参考资料：波特词干（Porter Streamming）提取算" /><meta name="keywords" content="fangd123, 博客, 技术, 生活" />






<meta name="generator" content="Hugo 0.65.3 with theme even" />


<link rel="canonical" href="https://blog.fangd123.com/post/nlp-notes/" />
<link rel="apple-touch-icon" sizes="180x180" href="https://blog.fangd123.com/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="https://blog.fangd123.com/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="https://blog.fangd123.com/favicon-16x16.png">
<link rel="manifest" href="https://blog.fangd123.com/manifest.json">
<link rel="mask-icon" href="https://blog.fangd123.com/safari-pinned-tab.svg" color="#5bbad5">

<!-- Visitor counter script; loaded with an explicit https: scheme instead of a protocol-relative URL so it is never fetched over plain HTTP. -->
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<link href="https://blog.fangd123.com/dist/even.c2a46f00.min.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.1.20/dist/jquery.fancybox.min.css" integrity="sha256-7TyXnr2YU040zfSP+rEcz29ggW4j56/ujTPwjMzyqFY=" crossorigin="anonymous">


<meta property="og:title" content="NLP学习笔记" />
<meta property="og:description" content="文本相似度 构词法相似 词根提取 波特词干（Porter Streamming）提取算法 参考资料：波特词干（Porter Streamming）提取算" />
<meta property="og:type" content="article" />
<meta property="og:url" content="https://blog.fangd123.com/post/nlp-notes/" />
<meta property="article:published_time" content="2015-11-07T21:53:49-05:00" />
<meta property="article:modified_time" content="2015-11-07T21:53:49-05:00" />
<meta itemprop="name" content="NLP学习笔记">
<meta itemprop="description" content="文本相似度 构词法相似 词根提取 波特词干（Porter Streamming）提取算法 参考资料：波特词干（Porter Streamming）提取算">
<meta itemprop="datePublished" content="2015-11-07T21:53:49-05:00" />
<meta itemprop="dateModified" content="2015-11-07T21:53:49-05:00" />
<meta itemprop="wordCount" content="1228">



<meta itemprop="keywords" content="nlp,自然语言处理,coursera," /><meta name="twitter:card" content="summary"/>
<meta name="twitter:title" content="NLP学习笔记"/>
<meta name="twitter:description" content="文本相似度 构词法相似 词根提取 波特词干（Porter Streamming）提取算法 参考资料：波特词干（Porter Streamming）提取算"/>

<!--[if lte IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/classlist/1.1.20170427/classList.min.js"></script>
<![endif]-->

<!--[if lt IE 9]>
  <script src="https://cdn.jsdelivr.net/npm/html5shiv@3.7.3/dist/html5shiv.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/respond.js@1.4.2/dest/respond.min.js"></script>
<![endif]-->

</head>
<body>
  <div id="mobile-navbar" class="mobile-navbar">
  <div class="mobile-header-logo">
    <a href="https://blog.fangd123.com/" class="logo">fangd123</a>
  </div>
  <div class="mobile-navbar-icon">
    <span></span>
    <span></span>
    <span></span>
  </div>
</div>
<nav id="mobile-menu" class="mobile-menu slideout-menu">
  <!-- Each link sits inside its own <li>: a <ul> may only have <li> children, so the previous <a>-wrapping-<li> structure was invalid HTML. -->
  <ul class="mobile-menu-list">
    <li class="mobile-menu-item">
      <a href="https://blog.fangd123.com/">主页</a>
    </li>
    <li class="mobile-menu-item">
      <a href="https://blog.fangd123.com/categories/%E6%8A%80%E6%9C%AF">技术</a>
    </li>
    <li class="mobile-menu-item">
      <a href="https://blog.fangd123.com/categories/%E7%94%9F%E6%B4%BB">生活</a>
    </li>
    <li class="mobile-menu-item">
      <a href="https://blog.fangd123.com/post/">归档</a>
    </li>
    <li class="mobile-menu-item">
      <a href="https://blog.fangd123.com/tags/">标签</a>
    </li>
    <li class="mobile-menu-item">
      <a href="https://blog.fangd123.com/categories/">分类</a>
    </li>
  </ul>
</nav>
  <div class="container" id="mobile-panel">
    <header id="header" class="header">
        <div class="logo-wrapper">
  <a href="https://blog.fangd123.com/" class="logo">fangd123</a>
</div>

<nav class="site-navbar">
  <ul id="menu" class="menu">
    <li class="menu-item">
        <a class="menu-item-link" href="https://blog.fangd123.com/">主页</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="https://blog.fangd123.com/categories/%E6%8A%80%E6%9C%AF">技术</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="https://blog.fangd123.com/categories/%E7%94%9F%E6%B4%BB">生活</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="https://blog.fangd123.com/post/">归档</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="https://blog.fangd123.com/tags/">标签</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="https://blog.fangd123.com/categories/">分类</a>
      </li>
  </ul>
</nav>
    </header>

    <main id="main" class="main">
      <div class="content-wrapper">
        <div id="content" class="content">
          <article class="post">
    
    <header class="post-header">
      <h1 class="post-title">NLP学习笔记</h1>

      <div class="post-meta">
        <span class="post-time"> 2015-11-07 </span>
        <div class="post-category">
            <a href="https://blog.fangd123.com/categories/%E6%8A%80%E6%9C%AF/"> 技术 </a>
            </div>
          <span class="more-meta"> 约 1228 字 </span>
          <span class="more-meta"> 预计阅读 3 分钟 </span>
        <!-- Page-view counter: the spinner is a loading placeholder inside the value span, so its alt text describes the state rather than the file name. -->
        <span id="busuanzi_container_page_pv" class="more-meta"> <span id="busuanzi_value_page_pv"><img src="https://blog.fangd123.com/img/spinner.svg" alt="加载中"/></span> 次阅读 </span>
      </div>
    </header>

    <div class="post-toc" id="post-toc">
  <h2 class="post-toc-title">文章目录</h2>
  <div class="post-toc-content always-active">
    <nav id="TableOfContents">
  <ul>
    <li><a href="#文本相似度">文本相似度</a>
      <ul>
        <li><a href="#构词法相似">构词法相似</a></li>
        <li><a href="#语义相似">语义相似</a></li>
        <li><a href="#文档相似">文档相似</a></li>
        <li><a href="#维度缩减">维度缩减</a></li>
      </ul>
    </li>
    <li><a href="#nlp的任务">NLP的任务</a>
      <ul>
        <li><a href="#词性标注">词性标注</a></li>
        <li><a href="#信息提取">信息提取</a></li>
        <li><a href="#假阳性">假阳性</a></li>
        <li><a href="#语义一阶逻辑推断语义分析">语义：一阶逻辑，推断，语义分析</a></li>
        <li><a href="#语义消岐wsd">语义消歧（WSD）</a></li>
        <li><a href="#命名实体识别">命名实体识别</a></li>
        <li><a href="#语义角色标注">语义角色标注</a></li>
        <li><a href="#指代消解">指代消解</a></li>
        <li><a href="#回答问题">回答问题</a></li>
        <li><a href="#机器翻译">机器翻译</a></li>
        <li><a href="#文本摘要">文本摘要</a></li>
        <li><a href="#语音合成">语音合成</a></li>
        <li><a href="#蕴涵和转述">蕴涵和转述</a></li>
        <li><a href="#叙述分析">叙述分析</a></li>
        <li><a href="#对话系统">对话系统</a></li>
      </ul>
    </li>
  </ul>
</nav>
  </div>
</div>
    <div class="post-content">
      <h2 id="文本相似度">文本相似度</h2>
<h3 id="构词法相似">构词法相似</h3>
<h4 id="词根提取">词根提取</h4>
<h5 id="波特词干porter-streamming提取算法">波特词干（Porter Stemming）提取算法</h5>
<p><em>参考资料：<a href="http://blog.csdn.net/zhanghaiyang9999/article/details/41624007">波特词干（Porter Streamming）提取算法详解</a></em></p>
<p>英文词汇是由两部分构成，词干和词缀，词缀分为前缀和后缀，词干提取算法只涉及去除后缀操作。</p>
<p><em>算法论文原文：<a href="http://tartarus.org/~martin/PorterStemmer/def.txt"> An algorithm for suffix stripping</a></em></p>
<p>基于规则的算法：英文词汇都是由辅音字母和元音字母组成，并且一般是交替出现的规律。</p>
<p>算法步骤：</p>
<ol>
<li>直接对照规则匹配，对词汇进行第一次变形</li>
<li>判断元音辅音字母出现次数，寻找对应匹配规则</li>
<li>根据事先的规则，从词尾开始，对词汇进行变形</li>
<li>跳转到第一步，若能寻找到，则继续</li>
<li>若寻找不到对应匹配规则，该算法结束</li>
</ol>
<p>适用范围：轻度提取比较准确，深度提取慎用。</p>
<p>实际应用：搜索引擎对于用户需要查找的词汇提取词干后匹配各种词汇变形，例如过去分词查找到过去式</p>
<h4 id="拼写相似">拼写相似</h4>
<h5 id="编辑距离">编辑距离</h5>
<p>用来衡量两个字符串之间的相似度</p>
<p><em>参考资料:<a href="http://blog.csdn.net/chndata/article/details/42552971">字符串相似度算法编辑距离Levenshtein Distance</a></em></p>
<p>这是一种动态规划算法，其中，插入、删除、替换的成本都是1</p>
<p><em>论文原文：<a href="http://www.let.rug.nl/~kleiweg/lev/">A program for aligning sentences in bilingual corpora. </a></em></p>
<p>算法步骤：</p>
<p><strong>待补充</strong></p>
<p>适用范围：空间复杂度为O(MN)，字符串较短的情况下能够取得比较好的性能，但是对于长字符串需要极大的空间</p>
<p>实际应用：生物序列相似度比较、抄袭检测、语音识别、拼写检查</p>
<h3 id="语义相似">语义相似</h3>
<h4 id="wordnet">Wordnet</h4>
<p>Wordnet是一个包含词汇之间关系的词汇数据库，词汇之间的主要关系是上位关系，因此更像一个树形结构的数据库</p>
<h5 id="基于分类词典的词汇相似度计算方法">基于分类词典的词汇相似度计算方法</h5>
<p>使用Wordnet tree进行路径相似度计算</p>
<p>两个单词之间的路径相似度（Philip Resnik）</p>
<p>Sim(v,w) = - log P(LCS(v,w))</p>
<p>LCS = lowest common subsumer</p>
<p>信息内容</p>
<p>Dekang Lin</p>
<ul>
<li>将Wordnet词汇增加概率参数</li>
<li>IC(c) = -log P(c)</li>
<li>$Sim(v,w) = \dfrac{2 \times \log P(LCS(v,w))}{\log P(v) + \log P(w)}$</li>
</ul>
<h3 id="文档相似">文档相似</h3>
<h4 id="向量空间模型">向量空间模型</h4>
<p>Cos相似度</p>
<p>Cos测量两个向量之间的夹角</p>
<p>$\sigma(D,Q)=\frac{\sum_i d_i q_i}{\sqrt{\sum_i d_i^2}\sqrt{\sum_i q_i^2}}$</p>
<h4 id="分布相似度">分布相似度</h4>
<p>两个单词出现在相似的上下文中更有可能拥有语义联系</p>
<h4 id="关联强度">关联强度</h4>
<p>频次相关，我们需要忽略假的配对。然而，单独频次是不够的，一个通用的方法是使用逐点互信息（PMI）</p>
<p>w：单词
c:上下文特征</p>
<p>$PMI(w,c) = \log \frac{P(w,c)}{P(w)P(c)}$</p>
<h3 id="维度缩减">维度缩减</h3>
<p><em>参考资料：<a href="http://molecular-service-science.com/2014/07/16/eigen-value-singular-value-decomposition-principal-component-analysis/">維度縮減Dimension Reduction，通往線性代數的聖母峰 : 特徵值分解(Eigenvalue Decomposition)、奇異值分解(Singular Value Decomposition) 與主成分分析(Principal Component Analysis) – 服務科學的分子廚房 Molecular Service Science</a></em></p>
<h5 id="简单的向量方法判断相似度的问题">简单的向量方法判断相似度的问题</h5>
<ul>
<li>一词多义</li>
<li>同义词</li>
<li>无关词</li>
<li>稀疏矩阵</li>
</ul>
<p><strong>维度缩减</strong>：寻找数据中隐藏的相似度，基于矩阵分解</p>
<p><strong>特征向量和特征值</strong></p>
<p><strong>奇异值分解SVD</strong></p>
<p><em>参考《数学之美》相关章节内容</em></p>
<p>对于矩阵进行分解，以便找出相对数量少却包含重要矩阵信息要素的新的矩阵，并用新的矩阵近似表示原矩阵，以达到以简驭繁或者减少噪音的目的</p>
<p><strong>需要补充一个具体的例子（等我实现了这个算法再说）</strong></p>
<p><strong>潜在语义索引（LSI）</strong></p>
<p>维度缩减 = 识别隐藏的概念
在潜在空间中查询匹配</p>
<h2 id="nlp的任务">NLP的任务</h2>
<h3 id="词性标注">词性标注</h3>
<h4 id="依存分析">依存分析</h4>
<h3 id="信息提取">信息提取</h3>
<h3 id="假阳性">假阳性</h3>
<h3 id="语义一阶逻辑推断语义分析">语义：一阶逻辑，推断，语义分析</h3>
<h3 id="语义消岐wsd">语义消歧（WSD）</h3>
<h3 id="命名实体识别">命名实体识别</h3>
<h3 id="语义角色标注">语义角色标注</h3>
<h3 id="指代消解">指代消解</h3>
<h3 id="回答问题">回答问题</h3>
<h3 id="机器翻译">机器翻译</h3>
<h3 id="文本摘要">文本摘要</h3>
<h3 id="语音合成">语音合成</h3>
<h3 id="蕴涵和转述">蕴涵和转述</h3>
<h3 id="叙述分析">叙述分析</h3>
<h3 id="对话系统">对话系统</h3>

    </div>

    <div class="post-copyright">
  <p class="copyright-item">
    <span class="item-title">文章作者</span>
    <span class="item-content">fangd123</span>
  </p>
  <p class="copyright-item">
    <span class="item-title">上次更新</span>
    <span class="item-content">
        2015-11-07
        
    </span>
  </p>
  
  
</div>
<footer class="post-footer">
      <div class="post-tags">
          <a href="https://blog.fangd123.com/tags/nlp/">nlp</a>
          <a href="https://blog.fangd123.com/tags/%E8%87%AA%E7%84%B6%E8%AF%AD%E8%A8%80%E5%A4%84%E7%90%86/">自然语言处理</a>
          <a href="https://blog.fangd123.com/tags/coursera/">coursera</a>
          </div>
      <nav class="post-nav">
        <a class="prev" href="https://blog.fangd123.com/post/vagrant-create-environment-for-web-development/">
            <i class="iconfont icon-left"></i>
            <span class="prev-text nav-default">Vagrant打造跨平台零配置Web开发环境问题和解决方案汇总</span>
            <span class="prev-text nav-mobile">上一篇</span>
          </a>
        <a class="next" href="https://blog.fangd123.com/post/%E6%8B%A5%E6%8A%B1%E5%8F%98%E5%8C%96%E4%BB%A5%E4%B8%8D%E5%8F%98%E5%BA%94%E4%B8%87%E5%8F%98/">
            <span class="next-text nav-default">拥抱变化，以不变应万变</span>
            <span class="next-text nav-mobile">下一篇</span>
            <i class="iconfont icon-right"></i>
          </a>
      </nav>
    </footer>
  </article>
        </div>
        <div id="gitalk-container"></div>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.css" crossorigin="anonymous">
    <script src="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.js" crossorigin="anonymous"></script>
    <script type="text/javascript">
      // Build the Gitalk comment widget for this post and mount it in #gitalk-container.
      var gitalk = new Gitalk({
        // Identifier for this page's comment thread (here a timestamp string).
        // NOTE(review): changing it presumably detaches the page from its existing comment thread; confirm before editing.
        id: '2015-11-07 21:53:49 -0500 -0500',
        title: 'NLP学习笔记',
        clientID: 'e7701fe6c1c3f1819cca',
        // NOTE(review): this client secret is embedded in the public page source, so every visitor can read it.
        // Confirm the OAuth app is scoped as narrowly as possible, and consider rotating the secret
        // or routing the token exchange through a server-side proxy.
        clientSecret: '0279597bf6b736da28a66ac649db4851303b9d95',
        repo: 'fangd123.github.io',
        owner: 'fangd123',
        admin: ['fangd123'],
        // The current page URL with percent-escapes decoded; presumably used as the text of the backing issue (verify against Gitalk docs).
        body: decodeURI(location.href)
      });
      gitalk.render('gitalk-container');
    </script>
    <noscript>Please enable JavaScript to view the <a href="https://github.com/gitalk/gitalk">comments powered by gitalk.</a></noscript>

  

  

      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="social-links">
      <!-- Icon-only links: aria-label provides the accessible name, since the anchors have no text content. -->
      <a href="mailto:fangd123@gmail.com" class="iconfont icon-email" title="email" aria-label="email"></a>
      <a href="https://github.com/fangd123" class="iconfont icon-github" title="github" aria-label="github"></a>
      <a href="https://weibo.com/p/1005051761690660/" class="iconfont icon-weibo" title="weibo" aria-label="weibo"></a>
      <a href="https://www.zhihu.com/people/fang-wen-da" class="iconfont icon-zhihu" title="zhihu" aria-label="zhihu"></a>
      <!-- The gitlab and bilibili entries were dropped: their href was http://localhost:1313, a local development address rather than a real profile URL. Re-add them once real profile URLs are configured. -->
  <a href="https://blog.fangd123.com/index.xml" type="application/rss+xml" class="iconfont icon-rss" title="rss" aria-label="rss"></a>
</div>

<div class="copyright">
  <span class="power-by">
    由 <a class="hexo-link" href="https://gohugo.io">Hugo</a> 强力驱动
  </span>
  <span class="division">|</span>
  <span class="theme-info">
    主题 - 
    <a class="theme-link" href="https://github.com/olOwOlo/hugo-theme-even">Even</a>
  </span>

  <!-- Site-wide visit counters: each spinner is a loading placeholder inside its value span, so its alt text describes the state rather than the file name. -->
  <div class="busuanzi-footer">
    <span id="busuanzi_container_site_pv"> 本站总访问量 <span id="busuanzi_value_site_pv"><img src="https://blog.fangd123.com/img/spinner.svg" alt="加载中"/></span> 次 </span>
      <span class="division">|</span>
    <span id="busuanzi_container_site_uv"> 本站总访客数 <span id="busuanzi_value_site_uv"><img src="https://blog.fangd123.com/img/spinner.svg" alt="加载中"/></span> 人 </span>
  </div>

  <span class="copyright-year">
    &copy; 
    2014 - 
    2020
    <span class="heart">
      <i class="iconfont icon-heart"></i>
    </span>
    <span class="author">fangd123</span>
  </span>
</div>
    </footer>

    <div class="back-to-top" id="back-to-top">
      <i class="iconfont icon-up"></i>
    </div>
  </div>
  
  <script src="https://cdn.jsdelivr.net/npm/jquery@3.2.1/dist/jquery.min.js" integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/slideout@1.0.1/dist/slideout.min.js" integrity="sha256-t+zJ/g8/KXIJMjSVQdnibt4dlaDxc9zXr/9oNPeWqdg=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.1.20/dist/jquery.fancybox.min.js" integrity="sha256-XVLffZaxoWfGUEbdzuLi7pwaUJv1cecsQJQqGLe7axY=" crossorigin="anonymous"></script>
<script type="text/javascript" src="https://blog.fangd123.com/dist/even.26188efa.min.js"></script>








</body>
</html>
